'''
LastEditors: 杜康
LastEditTime: 2021-09-28 13:01:34
'''
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Every '/wiki/...' href discovered so far; getLinks() checks it before
# following a link and adds each new href to it, so no link is followed twice.
pages = set()


# Scheme and host that site-relative '/wiki/...' hrefs are resolved against.
_WIKI_BASE_URL = 'https://en.wikipedia.org'
# Page fetched when getLinks() is called with an empty path.
_WIKI_START_PATH = '/wiki/Main_Page'
# Matches site-relative hrefs that begin with '/wiki/'.
_WIKI_LINK_RE = re.compile('^(/wiki/)')


def _fetchWikiHrefs(pageUrl):
    """Download one page and return an iterator over its '/wiki/' hrefs.

    An empty ``pageUrl`` selects the start page.
    """
    url = '{}{}'.format(_WIKI_BASE_URL, pageUrl or _WIKI_START_PATH)
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as html:
        bs = BeautifulSoup(html, 'html.parser')
    # find_all() is filtered on href, so every returned tag has that attribute.
    anchors = bs.find_all('a', href=_WIKI_LINK_RE)
    return iter([anchor.attrs['href'] for anchor in anchors])


def getLinks(pageUrl):
    """Crawl Wikipedia depth-first, printing every newly discovered link.

    Starting from ``pageUrl``, each '/wiki/...' href that is not yet in the
    module-level ``pages`` set is printed, recorded in ``pages`` and then
    crawled itself before the remaining links of the current page are
    examined.

    Args:
        pageUrl: Site-relative path such as '/wiki/Python'. An empty string
            starts the crawl at the Main_Page.

    Returns:
        None. Results are reported through ``print`` and the ``pages`` set.

    Raises:
        urllib.error.URLError: If a page cannot be fetched (propagated from
            ``urlopen``; HTTP errors are a subclass).
    """
    # An explicit stack of per-page link iterators replaces recursion: the
    # visiting order is the same, but crawl depth is no longer capped by the
    # interpreter's recursion limit.
    pending = [_fetchWikiHrefs(pageUrl)]
    while pending:
        for href in pending[-1]:
            if href in pages:
                continue
            print('-' * 20)
            print(href)
            pages.add(href)
            # Descend into the new page first; the current iterator keeps
            # its position and is resumed once the child is exhausted.
            pending.append(_fetchWikiHrefs(href))
            break
        else:
            # Every link on this page has been handled.
            pending.pop()


if __name__ == '__main__':
    # Start the crawl only when run as a script, so that importing this
    # module does not trigger network requests. The empty path begins at
    # the Main_Page.
    getLinks('')
